Urllib库
内置的HTTP请求库
- 请求模块: urllib.request
- 异常处理模块: urllib.error
- url解析模块: urllib.parse
- robots.txt解析模块: urllib.robotparser
- 代理: ProxyHandler (通过代理服务器发送请求)
- Cookie: http.cookiejar
- 网址拼接: urljoin
- 将字典转换成请求参数: urlencode
urllib
urlopen
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
1 2 3 4 | import urllib.request #请求模块 response = urllib.request.urlopen('http://www.baidu.com') #打开这个url地址,get请求 print(response.read().decode('utf-8')) #读取源代码,这里解码用utf-8 |
1 | <!DOCTYPE html><!--STATUS OK--> |
1 | </html> |
1 2 3 4 5 6 | import urllib.parse #url解析模块 import urllib.request data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8') #转换成二进制,这里传递过去word以及hello response = urllib.request.urlopen('http://httpbin.org/post', data=data)#加上data是一种post请求 print(response.read()) |
1 | b'{\n "args": {}, \n "data": "", \n "files": {}, \n "form": {\n "word": "hello"\n }, \n "headers": {\n "Accept-Encoding": "identity", \n "Content-Length": "10", \n "Content-Type": "application/x-www-form-urlencoded", \n "Host": "httpbin.org", \n "User-Agent": "Python-urllib/3.6", \n "X-Amzn-Trace-Id": "Root=1-5f0d3ca2-bb53229b018e17799e02b1ae"\n }, \n "json": null, \n "origin": "183.207.182.162", \n "url": "http://httpbin.org/post"\n}\n' |
1 2 3 4 | import urllib.request response = urllib.request.urlopen('http://httpbin.org/get', timeout=1) #timeout超时响应时间 print(response.read()) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | --------------------------------------------------------------------------- timeout Traceback (most recent call last) <ipython-input-9-624debaefd14> in <module> 1 import urllib.request 2 ----> 3 response = urllib.request.urlopen('http://httpbin.org/get', timeout=1) #timeout超时响应时间 4 print(response.read()) D:\Anaconda3\envs\CPU\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context) 221 else: 222 opener = _opener --> 223 return opener.open(url, data, timeout) 224 225 def install_opener(opener): D:\Anaconda3\envs\CPU\lib\urllib\request.py in open(self, fullurl, data, timeout) 524 req = meth(req) 525 --> 526 response = self._open(req, data) 527 528 # post-process response D:\Anaconda3\envs\CPU\lib\urllib\request.py in _open(self, req, data) 542 protocol = req.type 543 result = self._call_chain(self.handle_open, protocol, protocol + --> 544 '_open', req) 545 if result: 546 return result D:\Anaconda3\envs\CPU\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args) 502 for handler in handlers: 503 func = getattr(handler, meth_name) --> 504 result = func(*args) 505 if result is not None: 506 return result D:\Anaconda3\envs\CPU\lib\urllib\request.py in http_open(self, req) 1344 1345 def http_open(self, req): -> 1346 return self.do_open(http.client.HTTPConnection, req) 1347 1348 http_request = AbstractHTTPHandler.do_request_ D:\Anaconda3\envs\CPU\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args) 1319 except OSError as err: # timeout error 1320 raise URLError(err) -> 1321 r = h.getresponse() 1322 except: 1323 h.close() D:\Anaconda3\envs\CPU\lib\http\client.py in getresponse(self) 1352 try: 1353 try: -> 1354 response.begin() 1355 except 
ConnectionError: 1356 self.close() D:\Anaconda3\envs\CPU\lib\http\client.py in begin(self) 305 # read until we get a non-100 response 306 while True: --> 307 version, status, reason = self._read_status() 308 if status != CONTINUE: 309 break D:\Anaconda3\envs\CPU\lib\http\client.py in _read_status(self) 266 267 def _read_status(self): --> 268 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") 269 if len(line) > _MAXLINE: 270 raise LineTooLong("status line") D:\Anaconda3\envs\CPU\lib\socket.py in readinto(self, b) 584 while True: 585 try: --> 586 return self._sock.recv_into(b) 587 except timeout: 588 self._timeout_occurred = True timeout: timed out |
1 2 3 4 5 6 7 8 9 | import socket import urllib.request import urllib.error try: response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1) except urllib.error.URLError as e: #捕获异常 if isinstance(e.reason, socket.timeout): print('TIME OUT') |
1 | TIME OUT |
响应
响应类型
1 2 3 4 | import urllib.request response = urllib.request.urlopen('https://www.python.org') print(type(response)) #响应的类型 |
1 | <class 'http.client.HTTPResponse'> |
状态码、响应头
1 2 3 4 5 6 | import urllib.request response = urllib.request.urlopen('https://www.python.org') print(response.status) #状态码 print(response.getheaders()) #响应头 print(response.getheader('Server')) #使用的服务器的类型 |
1 2 3 | 200 [('Connection', 'close'), ('Content-Length', '48997'), ('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'DENY'), ('Via', '1.1 vegur'), ('Via', '1.1 varnish'), ('Accept-Ranges', 'bytes'), ('Date', 'Tue, 14 Jul 2020 05:03:52 GMT'), ('Via', '1.1 varnish'), ('Age', '3128'), ('X-Served-By', 'cache-bwi5126-BWI, cache-hkg17920-HKG'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '42, 1733'), ('X-Timer', 'S1594703032.137189,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')] nginx |
1 2 3 4 | import urllib.request response = urllib.request.urlopen('https://www.python.org') print(response.read().decode('utf-8')) #response.read()获取响应体的内容 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | <!doctype html> <!--[if lt IE 7]> <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9"> <![endif]--> <!--[if IE 7]> <html class="no-js ie7 lt-ie8 lt-ie9"> <![endif]--> <!--[if IE 8]> <html class="no-js ie8 lt-ie9"> <![endif]--> <!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr"> <!--<![endif]--> <head> <meta charset="utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js"> <meta name="application-name" content="Python.org"> <meta name="msapplication-tooltip" content="The official home of the Python Programming Language"> <meta name="apple-mobile-web-app-title" content="Python.org"> <meta name="apple-mobile-web-app-capable" content="yes"> <meta name="apple-mobile-web-app-status-bar-style" content="black"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="HandheldFriendly" content="True"> <meta name="format-detection" content="telephone=no"> <meta http-equiv="cleartype" content="on"> <meta http-equiv="imagetoolbar" content="false"> <script src="/static/js/libs/modernizr.js"></script> <link href="/static/stylesheets/style.30afed881237.css" rel="stylesheet" type="text/css" title="default" /> <link href="/static/stylesheets/mq.eef77a5d2257.css" rel="stylesheet" type="text/css" media="not print, braille, embossed, speech, tty" /> |
<!--[if (lte IE 8)&(!IEMobile)]>
<![endif]-->
1 2 3 4 | <title>Welcome to Python.org</title> <meta name="description" content="The official home of the Python Programming Language"> <meta name="keywords" content="Python programming language object oriented web free open source software license documentation download community"> |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | <meta property="og:image" content="https://www.python.org/static/opengraph-icon-200x200.png"> <meta property="og:image:secure_url" content="https://www.python.org/static/opengraph-icon-200x200.png"> <meta property="og:url" content="https://www.python.org/"> <link rel="author" href="/static/humans.txt"> <link rel="alternate" type="application/rss+xml" title="Python Enhancement Proposals" href="https://www.python.org/dev/peps/peps.rss/"> <link rel="alternate" type="application/rss+xml" title="Python Job Opportunities" href="https://www.python.org/jobs/feed/rss/"> <link rel="alternate" type="application/rss+xml" title="Python Software Foundation News" href="https://feeds.feedburner.com/PythonSoftwareFoundationNews"> <link rel="alternate" type="application/rss+xml" title="Python Insider" href="https://feeds.feedburner.com/PythonInsider"> |
1 2 3 4 | <script src="/static/js/libs/masonry.pkgd.min.js"></script> <script src="/static/js/libs/html-includes.js"></script> <script type="text/javascript" src="/static/js/main-min.a3326162e3f0.js" charset="utf-8"></script> |
<!--[if lte IE 7]>
<![endif]-->
1 2 | <!--[if lte IE 8]> <script type="text/javascript" src="/static/js/plugins/getComputedStyle-min.c3860be1d290.js" charset="utf-8"></script> |
<![endif]-->